# -*- coding: utf-8 -*-
"""
Code Introduction:
    This code 
Version History:
    Created: Tue Dec 29 10:28:44 2020
    Current: 

@author: Xing Guo (guoxing.econ@gmail.com)

"""

#%% Import Moduels

## I/O Tools
import os
import _pickle as pickle
import warnings

## Data Process Tools
import pandas as pd
import datetime
from fredapi import Fred

## Graphical
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as figpdf

## Numerical
import numpy as np

## Statistical
import statsmodels.formula.api as sm
import statsmodels.api as SMAPI

idx = pd.IndexSlice
fred = Fred(api_key='86cde3dec5dda5ffca44b58f01838b1e')
# End of Section: Import Moduels
###############################################################################


#%% Setup Working Directory

## Windows System Path
FolderList = ["E:\\Dropbox (Bank of Canada)\\Research Projects\\OHANK\\", \
              "B:\\Dropbox (Bank of Canada)\\Research Projects\\OHANK\\", \
              "D:\\Dropbox (Bank of Canada)\\Research Projects\\OHANK\\"]
for Folder in FolderList:
    WorkDir = Folder+"Empirics\\Analysis_Aggregate\\"
    if os.path.exists(WorkDir):
        os.chdir(WorkDir)    

## Output Folder
OutputFolder = 'TableGraph'
if not os.path.exists(OutputFolder):
    os.makedirs(OutputFolder)
# End of Section: Setup Working Directory
###############################################################################


#%% Helper Functions


## Generate the Quarter Date
# based on (Year, Quarter) Combination
def YQ2QDate(year,quarter):
    month = 3*(quarter-1)+1
    return datetime.datetime(int(year),int(month),1)
# based on the Exact Date
def Date2QDate(date):
    return datetime.datetime(date.year,int(np.floor((date.month-1)/3))*3+1,1)

## Read the Cleaned Stata Files
def UnitReadin(dta_Name):
    # 1. Read the data file
    TempDir = "..\\Data\\Aggregate\\Cleaned\\"
    DS = pd.read_stata(TempDir+dta_Name)
    # 2. Rename the variables and check the variables
    DS = DS.rename(columns={xx: xx.lower() for xx in DS.columns})
    List_Var_DS = DS.columns.tolist()
    List_Var_Check = ['year','quarter','geo']
    for vv in List_Var_Check:
        if not (vv in List_Var_DS):
            print("Missing variable "+vv+' in '+dta_Name+'\n')
            return []
    # 3. Create the date variable
    DS['qdate'] = DS.apply(lambda x: YQ2QDate(x['year'],x['quarter']),axis=1)
    # 4. Sort the data
    List_Var_Ordered = ['year','quarter','geo','qdate']+ \
                       list(set(DS.columns)-set(['year','quarter','geo','qdate']))
    DS = DS[List_Var_Ordered].sort_values(by='qdate').reset_index(drop=True)
    # 5. Rename the Variables
    Dict_Var_Name = {'year': 'Year', 'quarter': 'Quarter', 'geo': 'Geo', 'qdate': 'QDate'}
    DS = DS.sort_values(by=list(Dict_Var_Name.keys())) \
            .reset_index(drop=True).rename(columns=Dict_Var_Name)

    return DS

## Level to Log/Lin and Growth
def LogLin_Growth(DS,VarDict=None,LogLin='Log',SA='SA'):
    if isinstance(DS, pd.Series):
        DS = DS.to_frame()
    if VarDict==None:
        VarDict = {xx: xx for xx in DS.columns}
    for vv in VarDict.keys():
        # Level: Log or Lin 
        if LogLin=='Log':
            DS[vv] = np.log(DS[VarDict[vv]])*100
        elif LogLin=='Lin':
            DS[vv] = DS[VarDict[vv]]
        # Seasonality Adjustment    
        if SA=='SA':
            Temp = SMAPI.tsa.x13_arima_analysis(DS[vv].dropna(),log=False)
            DS[vv] = Temp.seasadj
        # Difference
        DS['Diff_'+vv] = DS[vv].diff()
    
    return DS

## Visual Check and Cleaning
def CheckClean(DS,VarDict,Label=''):
    # Visual Check
    TempVarList_1 = list(VarDict.keys())
    TempVarList_2 = ['Diff_'+vv for vv in VarDict.keys()]
    DS[TempVarList_1].plot(title=Label+": Level")
    DS[TempVarList_2].plot(title=Label+": Change")
    # Drop the Unnecessary Variables
    TempVarList = ['Geo','Year','Quarter']+TempVarList_1+TempVarList_2
    DS = DS[TempVarList]
    
    return DS

#%% Monetary Policy Shocks

## Canada
DS_MS_CA = pd.read_excel(open("../Data/Canada_MonetaryShocks.xlsx","rb"),sheet_name='Version_2') \
           .rename(columns={'Canada shocks': 'MS_CA'})

DS_MS_CA['QDate'] = DS_MS_CA['Date'].map(Date2QDate)
DS_MS_CA_Q = DS_MS_CA.groupby('QDate')['MS_CA'].sum()

## US
DS_MS_US = pd.read_stata(open("../Data/US_MonetaryShocks.dta","rb")) \
            .rename(columns={'date': 'QDate', 'resid_full': 'MS_US'})
DS_MS_US_Q = DS_MS_US.set_index('QDate')['MS_US']

## Merged
DS_MS = pd.concat([DS_MS_US_Q,DS_MS_CA_Q],join='outer',axis=1)

pickle.dump(DS_MS,open('Temp/MS.py','wb'))

#%% Aggregate Variables: Canada

## NIPA: Components of GDP (in 2012 $)
DS_NIPA_CA = pd.read_csv(open("../Data/GDP.csv","rb"))
TempDict = {"REF_DATE": 'Date',"Estimates": 'Variable',"VALUE": 'Value', \
            'Seasonal adjustment': 'SA'}
DS_NIPA_CA = DS_NIPA_CA[list(TempDict.keys())].rename(columns=TempDict)
DS_NIPA_CA = DS_NIPA_CA[DS_NIPA_CA['SA']=='Seasonally adjusted at annual rates'] \
             .set_index(['Date','Variable'])['Value'].unstack(level=1)
VarDict = { 'Gross domestic product at market prices': 'GDP', \
            'Household final consumption expenditure': 'Consumption', \
            'Exports of goods and services': 'Export', \
            'Less: imports of goods and services': 'Import', \
            'Residential structures': 'ResidentialStructure'}
DS_NIPA_CA = DS_NIPA_CA[list(VarDict.keys())].rename(columns=VarDict)
DS_NIPA_CA.index = pd.to_datetime(DS_NIPA_CA.index).rename('QDate')
DS_NIPA_CA['NetExportIndex'] = (DS_NIPA_CA['Export']-DS_NIPA_CA['Import'])/DS_NIPA_CA['GDP']*100
# GDP
DS_GDP_CA = LogLin_Growth(DS_NIPA_CA['GDP'],SA='NSA')

# Total Consumption
DS_Cons_CA = LogLin_Growth(DS_NIPA_CA['Consumption'],SA='NSA')
# Export
DS_Export_CA = LogLin_Growth(DS_NIPA_CA['Export'],SA='NSA')
# Import
DS_Import_CA = LogLin_Growth(DS_NIPA_CA['Import'],SA='NSA')
# Net Export
DS_NetExportIdx_CA = LogLin_Growth(DS_NIPA_CA['NetExportIndex'],LogLin='Lin',SA='NSA')

## Prices
# Nominal Rate
DS_NomRate_CA = fred.get_series('IR3TIB01CAQ156N').rename('NominalRate')
DS_NomRate_CA = LogLin_Growth(DS_NomRate_CA,LogLin='Lin')
# CPI and Inflation
DS_CPI_CA_M = pd.read_csv(open("../Data/CPI.csv",'rb'))
TempDict = {"REF_DATE": 'Date','Products and product groups': 'Variable',"VALUE": 'Value'}
DS_CPI_CA_M = DS_CPI_CA_M[list(TempDict.keys())].rename(columns=TempDict) \
                 .set_index(['Date','Variable'])['Value'].unstack(level=1)
VarDict = {'All-items': 'CPI'}
DS_CPI_CA_M = DS_CPI_CA_M[list(VarDict.keys())].rename(columns=VarDict)
DS_CPI_CA_M.index = pd.to_datetime(DS_CPI_CA_M.index)
DS_CPI_CA_M['CPI'] = DS_CPI_CA_M['CPI']/DS_CPI_CA_M.loc[datetime.datetime(2012,1,1),'CPI']
DS_CPI_CA_M['Inflation'] = np.log(DS_CPI_CA_M['CPI']).diff()*100

DS_CPI_CA_M['QDate'] = DS_CPI_CA_M.index.map(Date2QDate)
CPI_Q = DS_CPI_CA_M.groupby('QDate')['CPI'].mean()
DS_CPI_CA = DS_CPI_CA_M.groupby('QDate')[['Inflation']].mean()*3*4
DS_CPI_CA = LogLin_Growth(DS_CPI_CA['Inflation'],LogLin='Lin')
DS_CPI_CA['CPI'] = DS_CPI_CA['Inflation'].cumsum()/4
DS_CPI_CA['Diff_CPI'] = DS_CPI_CA['Inflation']/4

# Labor Income
DS_LaborIncome_CA = pd.read_csv(open("../Data/LaborIncome.csv",'rb'))
TempDict = {"REF_DATE": 'Date',"Estimates": 'Variable',"VALUE": 'Value', \
            'Seasonal adjustment': 'SA'}
DS_LaborIncome_CA = DS_LaborIncome_CA[DS_LaborIncome_CA['Seasonal adjustment']=='Seasonally adjusted at annual rates']
DS_LaborIncome_CA = DS_LaborIncome_CA[list(TempDict.keys())].rename(columns=TempDict) \
                 .set_index(['Date','Variable'])['Value'].unstack(level=1)
VarDict = {'Equals: compensation of employees paid by resident entities': 'LaborIncome'}
DS_LaborIncome_CA = DS_LaborIncome_CA[list(VarDict.keys())].rename(columns=VarDict)
DS_LaborIncome_CA.index = pd.to_datetime(DS_LaborIncome_CA.index)

DS_LaborIncome_CA = DS_LaborIncome_CA.merge(CPI_Q,how='left',left_index=True,right_index=True)
DS_LaborIncome_CA = (DS_LaborIncome_CA['LaborIncome']/DS_LaborIncome_CA['CPI']) \
                    .rename('LaborIncome').dropna().to_frame()
DS_LaborIncome_CA = LogLin_Growth(DS_LaborIncome_CA['LaborIncome'],LogLin='Log')
# Exchange Rate
DS_Ex = fred.get_series('DEXCAUS').rename('ExchangeRate').to_frame()
DS_Ex['Depreciation'] = np.log(DS_Ex['ExchangeRate']).diff()*100
DS_Ex['QDate'] = DS_Ex.index.map(Date2QDate)
DS_Ex = (DS_Ex.groupby('QDate')['Depreciation'].mean()*3*4).to_frame()
DS_Ex = LogLin_Growth(DS_Ex['Depreciation'],LogLin='Lin')
DS_Ex['ExchangeRate'] = DS_Ex['Depreciation'].cumsum()/4
DS_Ex['Diff_ExchangeRate'] = DS_Ex['Depreciation']/4

# Commodity Prices
DS_ComPrice = pd.read_csv(open("../Data/CommodityPrice.csv",'rb'))
TempDict = {"REF_DATE": 'Date',"Commodity": 'Variable',"VALUE": 'Value'}
DS_ComPrice = DS_ComPrice[list(TempDict.keys())].rename(columns=TempDict) \
                 .set_index(['Date','Variable'])['Value'].unstack(level=1)
VarDict = {'Total, all commodities': 'CommodityPrice'}
DS_ComPrice = DS_ComPrice[list(VarDict.keys())].rename(columns=VarDict)
DS_ComPrice.index = pd.to_datetime(DS_ComPrice.index)

DS_ComPrice = DS_ComPrice.merge(DS_CPI_CA_M,how='left',left_index=True,right_index=True)
DS_ComPrice = (DS_ComPrice['CommodityPrice']/DS_ComPrice['CPI']) \
              .rename('CommodityPrice').dropna().to_frame()
DS_ComPrice['QDate'] = DS_ComPrice.index.map(Date2QDate)
DS_ComPrice = DS_ComPrice.groupby('QDate')[['CommodityPrice']].mean()
DS_ComPrice = LogLin_Growth(DS_ComPrice,LogLin='Log')

# Household Balance Sheet Aggregate Figures
DS_HhBS_Asset = pd.read_csv(open("../Data/HHBS_Asset.csv",'rb'))
DS_HhBS_Liability = pd.read_csv(open("../Data/HHBS_Liability.csv",'rb'))

TempDict = {"REF_DATE": 'Date',"Categories": 'Variable',"VALUE": 'Value'}
DS_HhBS_Asset = DS_HhBS_Asset[list(TempDict.keys())].rename(columns=TempDict) \
                .set_index(['Date','Variable'])['Value'].unstack(level=1)
DS_HhBS_Asset['QDate'] = pd.to_datetime(DS_HhBS_Asset.index)
DS_HhBS_Asset = DS_HhBS_Asset.set_index('QDate')
DS_HhBS_Asset['CurrencyDeposit_Foreign'] = DS_HhBS_Asset['Foreign currency and deposits']
DS_HhBS_Asset['CurrencyDeposit_Domestic'] = DS_HhBS_Asset['Canadian currency and deposits']
DS_HhBS_Asset['Bond_Domestic'] = DS_HhBS_Asset['Canadian short-term paper']+DS_HhBS_Asset['Canadian bonds and debentures']
DS_HhBS_Asset['Bond_Foreign'] = DS_HhBS_Asset['Foreign investments: short-term paper']+DS_HhBS_Asset['Foreign investments: bonds']
DS_HhBS_Asset['Total_Domestic'] = DS_HhBS_Asset[['CurrencyDeposit_Domestic','Bond_Domestic']].sum(axis=1)
DS_HhBS_Asset['Total_Foreign'] = DS_HhBS_Asset[['CurrencyDeposit_Foreign','Bond_Foreign']].sum(axis=1)

DS_HhBS_Asset = DS_HhBS_Asset[['_'.join([xx,yy]) for xx in ['CurrencyDeposit','Bond','Total'] for yy in ['Domestic','Foreign']]]

DS_HhBS_Liability = DS_HhBS_Liability[list(TempDict.keys())].rename(columns=TempDict) \
                    .set_index(['Date','Variable'])['Value'].unstack(level=1)
DS_HhBS_Liability['QDate'] = pd.to_datetime(DS_HhBS_Liability.index)
DS_HhBS_Liability = DS_HhBS_Liability.set_index('QDate')

DS_HhBS = DS_HhBS_Asset[['Total_Domestic','Total_Foreign']] \
          .rename(columns={'Total_Domestic': 'HhAsset_Domestic','Total_Foreign': 'HhAsset_Foreign'})
DS_HhBS['HhAsset_DomShare'] = DS_HhBS['HhAsset_Domestic']/DS_HhBS.sum(axis=1)*100
DS_HhBS['HhAsset_Domestic'] = np.log(DS_HhBS['HhAsset_Domestic'])*100
DS_HhBS['HhAsset_Foreign'] = np.log(DS_HhBS['HhAsset_Foreign'])*100
DS_HhBS = LogLin_Growth(DS_HhBS,LogLin='Lin')
# All the Liability Items are 0.

## GDP by Sectors
DS_GDPbySector = pd.read_csv(open("../Data/Aggregate/GDPbySector.csv",'rb'))

TempDict = {"REF_DATE": 'Date',"North American Industry Classification System (NAICS)": 'Sector',"VALUE": 'Value', \
            'Seasonal adjustment': 'SA'}
DS_GDPbySector = DS_GDPbySector[list(TempDict.keys())].rename(columns=TempDict)
DS_GDPbySector['QDate'] = DS_GDPbySector['Date'].map(lambda x: datetime.datetime(int(x.split('-')[0]),int(x.split('-')[1]),1))

DS_GDPbySector = DS_GDPbySector.pivot(index='QDate',columns='Sector',values='Value')

SectorDict = pd.read_excel(open("../Data/Aggregate/SectorCategories.xls",'rb'))
SectorDict = {'N': SectorDict.loc[SectorDict['N_T']=='N','Sector'].values.tolist(), \
              'T': SectorDict.loc[SectorDict['N_T']=='T','Sector'].values.tolist()}

DS_GDPbySector = pd.concat([DS_GDPbySector[SectorDict[vv]].sum(axis=1) for vv in SectorDict.keys()], \
                            axis=1,keys=SectorDict.keys())

DS_GDPbySector = LogLin_Growth(DS_GDPbySector,LogLin='Log')

## Controls: Total Labor Income, Commodity Price

## Merged
DS_CA_List = [DS_GDP_CA,DS_Cons_CA,DS_Export_CA,DS_Import_CA, \
              DS_NetExportIdx_CA, DS_NomRate_CA, DS_CPI_CA, DS_LaborIncome_CA, \
              DS_Ex, DS_ComPrice, DS_HhBS, DS_GDPbySector]
DS_CA = pd.concat(DS_CA_List,axis=1)
DS_CA.index = DS_CA.index.rename('QDate')

DS_CA.rename(columns={xx: xx+'_CA' for xx in DS_CA.columns},inplace=True)

#%% Aggregate Variables: US

## NIPA
# GDP
DS_GDP_US = fred.get_series('GDPC1').rename('GDP')
DS_GDP_US = LogLin_Growth(DS_GDP_US,SA='NSA')

# Consumption
DS_Cons_US = fred.get_series('ND000349Q').rename('Consumption')
DS_Cons_US = LogLin_Growth(DS_Cons_US)

## Prices
# Nominal Rate
DS_NomRate_US = fred.get_series('TB3MS').rename('NominalRate').to_frame()
DS_NomRate_US['QDate'] = DS_NomRate_US.index.map(Date2QDate)
DS_NomRate_US = DS_NomRate_US.groupby('QDate')['NominalRate'].mean()
DS_NomRate_US = LogLin_Growth(DS_NomRate_US,LogLin='Lin',SA='NSA')
# CPI and Inflation
DS_CPI_US = fred.get_series('CPALTT01USQ661S').rename('CPI').to_frame()
DS_CPI_US['Inflation'] = np.log(DS_CPI_US['CPI']).diff()*100
DS_CPI_US = LogLin_Growth(DS_CPI_US['Inflation'],LogLin='Lin')
DS_CPI_US['CPI'] = DS_CPI_US['Inflation'].cumsum()
DS_CPI_US['Diff_CPI'] = DS_CPI_US['Inflation']

## Merged
DS_US_List = [DS_GDP_US,DS_Cons_US,DS_NomRate_US,DS_CPI_US]
DS_US = pd.concat(DS_US_List,axis=1)
DS_US.index = DS_US.index.rename('QDate')

DS_US.rename(columns={xx: xx+'_US' for xx in DS_US.columns},inplace=True)

#%% Aggregate Variables: Merged
DS_Agg = pd.concat([DS_CA,DS_US],axis=1)

pickle.dump(DS_Agg,open('Temp\DS_Agg.py','wb'))


